
import requests
import re
from bs4 import BeautifulSoup
def _fetch_html(url, timeout):
    """Download *url* and return its markup re-serialised by BeautifulSoup/lxml."""
    # A timeout is passed explicitly: requests waits indefinitely without one.
    response = requests.get(url, timeout=timeout)
    return str(BeautifulSoup(response.text, 'lxml'))


def news_crawler(timeout=10):
    """Print one headline from the 163.com front page plus its article details.

    The front page is downloaded and the first span of markup running from
    the ``news_default_yw`` marker to the next ``</div>`` is searched for
    ``<a href="...">`` links.  Only the second link is followed: its title and
    URL are printed, the article page is downloaded, and three fields captured
    by a regular expression (named ``content``, ``resource`` and ``author``
    here) are printed.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        True once the selected links have been processed.

    Raises:
        IndexError: If the ``news_default_yw`` block is not present on the
            front page.
        requests.RequestException: If a request fails or times out.
    """
    url = 'https://www.163.com/'
    front_page = _fetch_html(url, timeout)

    # re.search stops at the first match, which is all that is needed here.
    headline_block = re.search(r'news_default_yw.*?</div>', front_page, re.S)
    if headline_block is None:
        # Same exception type the unchecked [0] lookup used to raise, but
        # with a message that names what was missing.
        raise IndexError('headline block "news_default_yw" not found on ' + url)

    news_list = re.findall(r'<a href="(.*?)">(.*?)</a>', headline_block.group(0), re.S)

    # Compiled once, outside the loop; the pattern text is unchanged.
    article_pattern = re.compile(
        r'<p class="otitle">(.*?)<!-- 作者 -->.*本文来源：(.*?)</span>.*责任编辑：(.*?)</span>.*</div>',
        re.S,
    )

    # Only the second link is processed.
    # NOTE(review): presumably the first anchor is not a regular article - confirm.
    for news_url, news_title in news_list[1:2]:
        print(news_title + '   ' + news_url)
        html_news = _fetch_html(news_url, timeout)

        details = article_pattern.search(html_news)
        if details is None:
            # The article page does not have the expected layout; report it
            # instead of failing on an empty result list.
            print('could not extract article details from ' + news_url)
            continue

        content, resource, author = details.groups()
        print(content, resource, author)
    return True

# Run the crawler only when this file is executed as a script, so importing
# the module does not trigger network requests.
if __name__ == "__main__":
    news_crawler()